First step to this assignment is to list all the libraries that will be used.
library(tidyverse)
## -- Attaching packages ------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ---------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(sf)
## Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 6.3.1
library(tigris)
## To enable
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
library(leaflet)
library("dplyr")
library("plyr")
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:plotly':
##
## arrange, mutate, rename, summarise
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
library("readr")
library(censusapi)
##
## Attaching package: 'censusapi'
## The following object is masked from 'package:methods':
##
## getFunction
library("rgdal")
## Loading required package: sp
## rgdal: version: 1.5-16, (SVN revision 1050)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.0.4, released 2020/01/28
## Path to GDAL shared files: C:/Users/mirei/OneDrive/Documents/R/win-library/4.0/rgdal/gdal
## GDAL binary built with GEOS: TRUE
## Loaded PROJ runtime: Rel. 6.3.1, February 10th, 2020, [PJ_VERSION: 631]
## Path to PROJ shared files: C:/Users/mirei/OneDrive/Documents/R/win-library/4.0/rgdal/proj
## Linking to sp version:1.4-2
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading rgdal.
library("sp")
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
After downloading the data, I created a stacked dataset that combines all the various csv files.
years <- 2017:2020
quarters <- 1:4 #there are four quarters in each year
types <- c("Electric", "Gas") #Electric and Gas each have different sections: residential, commercial, etc
pge_Elec_Gas_All <- NULL #creating the empty dataset
for (quarter in quarters){
for (year in years){
#since there are no quarter 3 and4 for year 2020 I need to make sure that the loop does not search for these specific quarters
if (year == 2020 & quarter == 3){
next()
}
if (year == 2020 & quarter == 4){
next()
}
for (type in types){
filename <-
paste0(
"PGE_",
year,
"_Q",
quarter,
"_",
type,
"UsageByZip.csv"
)
temp <- read_csv(filename)
temp$TOTALkBTU <- NULL #creating an empty column
temp$AVERAGEkBTU <- NULL #creating an empty column
temp$DATE <- as.yearmon(paste(temp$YEAR, temp$MONTH), "%Y %m")
if (type == "Gas"){ #calculating the kBTU (total and average) for gas
temp$TOTALkBTU <- temp$TOTALTHM * 100
temp$AVERAGEkBTU <- temp$TOTALkBTU/temp$TOTALCUSTOMERS
temp$TOTALTHM <- NULL
temp$AVERAGETHM <- NULL
}
if (type == "Electric"){ #calculating the kBTU (total and average) for electricity
temp$TOTALkBTU <- temp$TOTALKWH * 3.4121416
temp$AVERAGEkBTU <- temp$TOTALkBTU/temp$TOTALCUSTOMERS
temp$TOTALKWH <- NULL
temp$AVERAGEKWH <- NULL
}
pge_Elec_Gas_All <- rbind(pge_Elec_Gas_All, temp)
}
}
saveRDS(pge_Elec_Gas_All, "pge_Elec_Gas_All.rds")
}
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALKWH = col_number(),
## AVERAGEKWH = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALTHM = col_number(),
## AVERAGETHM = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALKWH = col_number(),
## AVERAGEKWH = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALTHM = col_number(),
## AVERAGETHM = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALKWH = col_number(),
## AVERAGEKWH = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALTHM = col_number(),
## AVERAGETHM = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALKWH = col_number(),
## AVERAGEKWH = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALTHM = col_number(),
## AVERAGETHM = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALKWH = col_number(),
## AVERAGEKWH = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALTHM = col_number(),
## AVERAGETHM = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALKWH = col_number(),
## AVERAGEKWH = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALTHM = col_number(),
## AVERAGETHM = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALKWH = col_number(),
## AVERAGEKWH = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALTHM = col_number(),
## AVERAGETHM = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALKWH = col_number(),
## AVERAGEKWH = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALTHM = col_number(),
## AVERAGETHM = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALKWH = col_number(),
## AVERAGEKWH = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALTHM = col_number(),
## AVERAGETHM = col_double()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALKWH = col_number(),
## AVERAGEKWH = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALTHM = col_number(),
## AVERAGETHM = col_double()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALKWH = col_number(),
## AVERAGEKWH = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALTHM = col_number(),
## AVERAGETHM = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALKWH = col_number(),
## AVERAGEKWH = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALTHM = col_number(),
## AVERAGETHM = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALKWH = col_number(),
## AVERAGEKWH = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALTHM = col_number(),
## AVERAGETHM = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALKWH = col_number(),
## AVERAGEKWH = col_number()
## )
## Parsed with column specification:
## cols(
## ZIPCODE = col_double(),
## MONTH = col_double(),
## YEAR = col_double(),
## CUSTOMERCLASS = col_character(),
## COMBINED = col_character(),
## TOTALCUSTOMERS = col_number(),
## TOTALTHM = col_number(),
## AVERAGETHM = col_number()
## )
head("pge_Elec_Gas_All.rds")
## [1] "pge_Elec_Gas_All.rds"
Once I have created the stacked dataset, I will filter the data to have only the Bay Area zipcodes.
options(tigris_use_cache = FALSE)
ca_counties <- counties("CA", cb = T, progress_bar = F)
#st_drivers()
bay_county_names <-
c(
"Alameda",
"Contra Costa",
"Marin",
"Napa",
"San Francisco",
"San Mateo",
"Santa Clara",
"Solano",
"Sonoma"
)
bay_counties <-
ca_counties %>%
filter(NAME %in% bay_county_names)
usa_zips <-
zctas(cb = T, progress_bar = F)
#creates a dataframe of all bay area zipcodes
bay_zips <-
usa_zips %>%
st_centroid() %>%
.[bay_counties, ] %>%
st_set_geometry(NULL) %>%
left_join(usa_zips %>% select(GEOID10)) %>%
st_as_sf() #brings back coordinate reference systems
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over
## geometries of x
## Warning in st_centroid.sfc(st_geometry(x), of_largest_polygon =
## of_largest_polygon): st_centroid does not give correct centroids for longitude/
## latitude data
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## Joining, by = "GEOID10"
#now actually filtering the datasets that have the Bay Area zipcodes
bayAreaPGE<- subset(pge_Elec_Gas_All, pge_Elec_Gas_All$ZIPCODE %in% bay_zips$ZCTA5CE10)
After creating a dataset that only includes the Bay area zipcodes, I will filter the dataset to only include gas and electric for residential and commercial use.
bayAreaPGE_elec_final <-
bayAreaPGE %>%
# subset(pge_Elec_Gas_All, pge_Elec_Gas_All$ZIPCODE %in% bay_zips$ZCTA5CE10)%>%
filter( #choose only these customer classes
CUSTOMERCLASS %in%
c(
"Elec- Residential",
"Elec- Commercial", "Gas- Residential", "Gas- Commercial" #I could add gas data in here but I would have to change the subset
)
) %>%
select(
!c(COMBINED, AVERAGEkBTU)#don't include these columns
) %>%
dplyr::group_by(DATE, CUSTOMERCLASS) %>% #grouping by residential or commercial gas/electric and by date/month
dplyr::summarise(
TOTALkBTU =
sum(
TOTALkBTU,
na.rm = T
),
TOTALCUSTOMERS =
sum(
TOTALCUSTOMERS,
na.rm = T
)
) %>%
mutate(
AVERAGEkBTU =
TOTALkBTU/TOTALCUSTOMERS
)
## `summarise()` regrouping output by 'DATE' (override with `.groups` argument)
bayAreaPGE_elec_final
## # A tibble: 168 x 5
## # Groups: DATE [42]
## DATE CUSTOMERCLASS TOTALkBTU TOTALCUSTOMERS AVERAGEkBTU
## <yearmon> <chr> <dbl> <dbl> <dbl>
## 1 Jan 2017 Elec- Commercial 5632400479. 174860 32211.
## 2 Jan 2017 Elec- Residential 4803652706. 2495640 1925.
## 3 Jan 2017 Gas- Commercial 4604768900 89496 51452.
## 4 Jan 2017 Gas- Residential 17948726200 2179812 8234.
## 5 Feb 2017 Elec- Commercial 4687467036. 165585 28309.
## 6 Feb 2017 Elec- Residential 3821658257. 2497198 1530.
## 7 Feb 2017 Gas- Commercial 3292140900 89236 36893.
## 8 Feb 2017 Gas- Residential 11845495100 2180941 5431.
## 9 Mar 2017 Elec- Commercial 4906893462. 161665 30352.
## 10 Mar 2017 Elec- Residential 3697269785. 2498373 1480.
## # ... with 158 more rows
Create the graph
Now that I have my dataset that shows the monthly total kBTUs of residential and commercial electricity and gas consumption for the Bay Area from 2017 to 2020 quarter 2, I will create a bar graph to visualize this.
pge_chart <-
bayAreaPGE_elec_final %>%
ggplot() +
geom_bar(
aes(
x = DATE %>% factor(),
y = TOTALkBTU,
fill = CUSTOMERCLASS
),
stat = "identity",
position = "stack"
) +
labs(
x = "Date",
y = "kBTU",
title = "PG&E Bay Area Monthly Electricity and Gas Usage, 2017-2020",
fill = "Customer Class Type"
) +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)#to shift the dates so they're visible
)
pge_chart
pge_chart %>% ggplotly()
Q:Comment on any observable changes in energy consumption that may be attributable to the COVID-19 pandemic. A: Looking at the total kBTUs for March from 2019 to 2020, the electricity usage from commercial decreased by 357.5 million kBTU (total consumption) which could be due to businesses having to close down towards the middle of March. In regards to residential, electricity usage increased by roughly 150=1.2 million kBTU (total kBTU) and gas usage also increased 96.6 million kBTU. This increase in consumption can be due to people being laid off and activities being shut down which led to people staying at home longer than usual.
For the second part of the assignment, I created a dataset that calculates the change of electricity consumption due to COVID-19. I assumed that the best way to do this was to find the change of electricity consumption by subtracting 2020’s data with 2019’s data and dividing that value by 2019’s data value. I also assumed that the way to look at neighborhoods who experienced the greatest change in electricity consumption was to focus on residential consumption specifically as oppose to all of electricity consumption. I decided to keep the values as kBTU to make it easier to analyze. I removed any values that had 0 for 2019’s electricity residential usage.
bayAreaCOVID <-
bayAreaPGE %>%
filter(
YEAR %in% (2019:2020), MONTH %in% (1:5), CUSTOMERCLASS == "Elec- Residential"
) %>%
mutate(
ZIPCODE = ZIPCODE %>% as.character()
) %>%
dplyr::group_by(ZIPCODE, YEAR) %>%
dplyr::summarise(
TOTALkBTU =
sum(
TOTALkBTU,
na.rm = T
)
) %>%
right_join(
bay_zips %>% select(GEOID10),
by = c("ZIPCODE" = "GEOID10")
) %>%
pivot_wider(
names_from = YEAR,
names_prefix = "kBTU",
values_from = TOTALkBTU
) %>%
dplyr::filter(kBTU2019> 0) %>%
mutate(
COVID_influence =
((kBTU2019 - kBTU2020)/ kBTU2019) * 100
) %>%
st_as_sf()%>%
st_transform(4326)
## `summarise()` regrouping output by 'ZIPCODE' (override with `.groups` argument)
bayAreaCOVID
## Simple feature collection with 267 features and 5 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -123.5335 ymin: 36.89303 xmax: -121.3083 ymax: 38.93713
## geographic CRS: WGS 84
## # A tibble: 267 x 6
## # Groups: ZIPCODE [267]
## ZIPCODE geometry kBTU2019 kBTU2020 kBTUNA COVID_influence
## * <chr> <MULTIPOLYGON [°]> <dbl> <dbl> <dbl> <dbl>
## 1 94002 (((-122.3359 37.50805, -122~ 7.59e7 7.75e7 NA -2.18
## 2 94005 (((-122.442 37.69435, -122.~ 1.28e7 1.33e7 NA -4.33
## 3 94010 (((-122.4126 37.58916, -122~ 1.65e8 1.69e8 NA -2.17
## 4 94014 (((-122.4718 37.70225, -122~ 9.65e7 1.01e8 NA -5.17
## 5 94015 (((-122.4983 37.70813, -122~ 1.27e8 1.33e8 NA -4.77
## 6 94019 (((-122.5113 37.52301, -122~ 4.22e7 4.22e7 NA 0.0134
## 7 94020 (((-122.3284 37.3143, -122.~ 7.97e6 8.17e6 NA -2.51
## 8 94022 (((-122.1855 37.32374, -122~ 9.57e7 9.69e7 NA -1.30
## 9 94024 (((-122.1416 37.34951, -122~ 8.78e7 9.00e7 NA -2.50
## 10 94025 (((-122.2291 37.4243, -122.~ 1.33e8 1.38e8 NA -3.90
## # ... with 257 more rows
Once I created the dataset, I created the map to visualize the data.
res_pal <- colorNumeric(
palette = "Blues",
domain =
bayAreaCOVID$COVID_influence
)
leaflet() %>% #leaflet wants a spatial object
addTiles() %>%
addPolygons(
data = bayAreaCOVID,
fillColor = ~res_pal(COVID_influence),
color = "white",
opacity = 0.5,
fillOpacity = 0.5,
weight = 1,
label = ~paste0(
round(COVID_influence),
"kBTU Percent Change in",
ZIPCODE
),
highlightOptions = highlightOptions(
weight = 2,
opacity = 1
)
)%>%
addLegend(
data = bayAreaCOVID,
pal = res_pal,
values = ~COVID_influence,
title = "Percent Change in Residential<br>Electric Energy Usage possibly<br>due to COVID-19"
)
Based on this map, the neighborhood that experienced the greatest change was a neighborhood in Oakland with a 63% change.